from  urllib import request,error
import  re
'''
58要校验验证码，该脚本未实现，因此需要先在浏览器完成验证
'''
# Fetch the 58.com (Hangzhou) rental-listing search page with urllib.
# The `key` query parameter is the percent-encoded search keyword.
url="https://hz.58.com/chuzu/?newSearch=1&key=%E4%B8%89%E5%A2%A9"

req = request.Request(url)

# Open the Request object built above (it was previously created but never
# used) and close the response as soon as the body has been read.
with request.urlopen(req) as res:
    html = res.read().decode("utf-8")

print(html)
### Test data: markup of a single listing, for exercising the pattern offline
html2='''<a href="//jxjump.58.com/service?target=FCADV8oV3os7xtAhI2suhvPnTELfqafnhB4zBSrb6iqyIkHzZpk1zEffDjpdRkNz3Q5xoKYl4Bi0ja0QYcvYJ26jH_xd0r5V11gd3yBP1ZKMCGY0iFITaAgaDCTJ70NwSDxuLeEvZYetOdm4P-LGmgfXvNovkSBk_8HbPZtAUEnEhkw5O8_9E-uPIral4H5QrUvdByUl9PDblkheg3oFaEAXwdHK6Q-YyQO7rZ4Va80-ZO1mL8MQx3i05Yaoj08mTC5as&local=79&pubid=49521504&apptype=0&psid=173732394202150305616246543&entinfo=36129906701832_0&cookie=||https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DZHds-bYo5E3RHBh7MA-uAwOPOSweFgNw4EgL4ruvpDy%26wd%3D%26eqid%3Dc76263d100014647000000045bed6150|c5/njVvs8BinbZ8bA0b/Ag==&fzbref=0&key=三墩&params=jxfangchancomposite^desc" tongji_label="listclick"
                   onclick="clickLog('from=fcpc_zflist_gzcount');" target="_blank">
                    <img
                        lazy_src="//pic4.58cdn.com.cn/anjuke_58/e7fe1dc59cf6e9f740e5e0ba465caac5?w=294&h=220&crop=1"
                        src="//img.58cdn.com.cn/ui9/house/list/lazy_pic.png">
                                    </a>
                                    <span class="picNum">10 图</span>
                            </div>
            <div class="des">
                <h2>
                    <a href="//jxjump.58.com/service?target=FCADV8oV3os7xtAhI2suhvPnTELfqafnhB4zBSrb6iqyIkHzZpk1zEffDjpdRkNz3Q5xoKYl4Bi0ja0QYcvYJ26jH_xd0r5V11gd3yBP1ZKMCGY0iFITaAgaDCTJ70NwSDxuLeEvZYetOdm4P-LGmgfXvNovkSBk_8HbPZtAUEnEhkw5O8_9E-uPIral4H5QrUvdByUl9PDblkheg3oFaEAXwdHK6Q-YyQO7rZ4Va80-ZO1mL8MQx3i05Yaoj08mTC5as&local=79&pubid=49521504&apptype=0&psid=173732394202150305616246543&entinfo=36129906701832_0&cookie=||https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DZHds-bYo5E3RHBh7MA-uAwOPOSweFgNw4EgL4ruvpDy%26wd%3D%26eqid%3Dc76263d100014647000000045bed6150|c5/njVvs8BinbZ8bA0b/Ag==&fzbref=0&key=三墩&params=jxfangchancomposite^desc"
                       class="strongbox"
                       tongji_label="listclick"
                       onclick="clickLog('from=fcpc_zflist_gzcount');"
                       target="_blank"  rel="nofollow" >
                        整租 | 本人实地勘察 桔子里豪装loft户型1房 靠近地铁2                    </a>
                                            <em class="jinico"></em>
                                    </h2>
                <p class="room strongbox">1室2厅1卫                        &nbsp;&nbsp;&nbsp;&nbsp;76㎡</p>
                <p class="add">
                    <a href="/sandun/chuzu/"
                       onClick="clickLog('from=fcpc_list_hz_biaoti_shangquan')">三墩</a>
                    &nbsp;&nbsp;
                                            <a href="//hz.58.com/xiaoqu/jiezili/chuzu/"
                           target="_blank"
                           onClick="clickLog('from=fcpc_list_hz_biaoti_xiaoqu')">桔子里...出租</a>
                                                                <em></em>距离2号线墩祥街地铁站320米                                    </p>
                                    <div class="jjr">
                                                来自经纪人：                        <span class=" jjr_par">
                                                    <span class="jjr_par_dp">
                                豪世华邦租赁                            </span>
                                                    <span class="listjjr">
                                                            邹冬冬                                                        </span>
                    </span>
                    </div>
                            </div>
            <div class="listliright">
                <div class="sendTime">
                                    </div>
                <div class="money">
                    <b class="strongbox">3300</b>元/月                </div>
            </div>
            <div class="listline"></div>'''


#pat='<a href="//(.*?)".*?target="_blank">.*?<img lazy_src="//（.*?)" src="//(.*?)">.*?</a>.*?<span class="picNum">(.*?)</span>.*?</div><div class="des">.*?<a href="//(.*?)".*?target="_blank" (.*?)</a>.*?<p class="room strongbox">(.*?)</p>.*?</span>.*?</div>.*?</div>.*?<div class="listliright">.*?<div class="sendTime">.*?</div>.*?<div class="money">.*?<b class="strongbox">(.*?)</b>元/月'

# Listing pattern. There is no DOTALL flag, so every `.*?\n` consumes the rest
# of exactly one line of markup; the pattern therefore depends on the precise
# line layout of a listing. Capture groups, in order: detail-page link,
# `lazy_src` image URL, title, room layout, and the price fragment (which still
# carries the closing `</b>` and the unit text).
pat='<a href="//(.*?)".*?\n.*?target="_blank">.*?\n.*?<img.*?\n.*?lazy_src="//(.*?)".*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?target="_blank"  rel="nofollow" >\n(.*?)</a>\n.*?\n.*?\n.*?<p class="room strongbox">(.*?)</p>\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?<b class="strongbox">(.*?)</div>'

dlist=re.findall(pat,html)
# To check the pattern offline, match against the html2 sample instead:
# dlist2=re.findall(pat,html2)
print("打印匹配值")
if not dlist:
    # Nothing matched. The note at the top of this file says the site requires
    # a CAPTCHA check, so the fetched page may not be the listing page at all.
    print("未匹配到任何房源，可能需要先在浏览器完成验证码校验")
for v in dlist:
    print("房屋链接:"+v[0])
    print('图片地址：'+v[1])
    print('标题：' + v[2].replace(' ',''))
    # The layout text separates room count and area with a run of `&nbsp;`
    # entities; collapse that run to one space instead of printing it verbatim.
    print('户型：' + re.sub(r'(?:&nbsp;)+', ' ', v[3].replace(' ', '')))
    print('价格：' + v[4].replace(' ', '').replace('</b>',''))
    print("\n")

# for v in dlist2:##测试数据
#     print("房屋链接:"+v[0])
#     print('图片地址：'+v[1])
#     print('标题：' + v[2].replace(' ',''))
#     print('户型：' + v[3].replace(' ', ''))
#     print('价格：' + v[4].replace(' ', '').replace('</b>',''))

###以下为requests方式爬取数据

import requests
import re

# Same rental-listing search page as the urllib variant, fetched with requests.
url='https://hz.58.com/chuzu/?newSearch=1&key=%E4%B8%89%E5%A2%A9'

# The search page is only being retrieved, so issue a GET (the original sent a
# POST, unlike the urllib path). requests never times out by default; the
# explicit timeout keeps the script from hanging on a stalled connection.
res = requests.get(url, timeout=10)

html=res.content.decode(encoding='utf-8')

# Listing pattern. There is no DOTALL flag, so every `.*?\n` consumes the rest
# of exactly one line of markup; the pattern therefore depends on the precise
# line layout of a listing. Capture groups, in order: detail-page link,
# `lazy_src` image URL, title, room layout, and the price fragment (which still
# carries the closing `</b>` and the unit text).
pat='<a href="//(.*?)".*?\n.*?target="_blank">.*?\n.*?<img.*?\n.*?lazy_src="//(.*?)".*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?target="_blank"  rel="nofollow" >\n(.*?)</a>\n.*?\n.*?\n.*?<p class="room strongbox">(.*?)</p>\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?\n.*?<b class="strongbox">(.*?)</div>'

dlist=re.findall(pat,html)
# To check the pattern offline, match against the html2 sample instead:
# dlist2=re.findall(pat,html2)
print("requests方式爬取数据，打印匹配值")
if not dlist:
    # Nothing matched. The note at the top of this file says the site requires
    # a CAPTCHA check, so the fetched page may not be the listing page at all.
    print("未匹配到任何房源，可能需要先在浏览器完成验证码校验")
for v in dlist:
    print("房屋链接:"+v[0])
    print('图片地址：'+v[1])
    print('标题：' + v[2].replace(' ',''))
    # The layout text separates room count and area with a run of `&nbsp;`
    # entities; collapse that run to one space instead of printing it verbatim.
    print('户型：' + re.sub(r'(?:&nbsp;)+', ' ', v[3].replace(' ', '')))
    print('价格：' + v[4].replace(' ', '').replace('</b>',''))
    print("\n")